Prepare

library(tidyverse, warn.conflicts = F)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2, warn.conflicts = F)
library(purrr) # for functional programming
# Load data/netflix_titles.csv; empty strings and literal "NA" are read as NA.
df <- read.csv(
  file = "data/netflix_titles.csv",
  na.strings = c("", "NA")
)

Get country information

# Count titles per (country, year_added, type).
# A title that lists several comma-separated countries contributes one row to
# each of them.
grouped <- df %>%
  # keep only rows that have both a country and a date_added value
  filter(!is.na(country), country != "", !is.na(date_added)) %>%
  mutate(
    # parse date_added with the "%B %d, %Y" format and keep only the year
    year_added = year(parse_date(date_added, "%B %d, %Y")),
    # split the comma-separated list and strip whitespace around each name
    country = lapply(strsplit(as.character(country), ","), trimws)
  ) %>%
  unnest(country) %>% # one row per (title, country) pair
  group_by(country, year_added, type) %>%
  summarise(cnt = n()) %>%
  # drop blank country names that the split may have produced
  filter(!is.na(country), country != "")
## `summarise()` has grouped output by 'country', 'year_added'. You can override
## using the `.groups` argument.

Grouping

# Collapse the per-year counts into one total per (country, type).
by_country_type <- summarise(
  group_by(grouped, country, type),
  cnt = sum(cnt)
)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.

Movie Distribution by country

# Choropleth of movie counts, with locations matched by country name.
by_country_type %>%
  filter(type == "Movie") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    # hover text: country name followed by its count on a new line
    text = ~ paste(country, "<br>Counts: ", cnt)
  )

TV Show Distribution by country

# Choropleth of TV show counts, with locations matched by country name.
by_country_type %>%
  filter(type == "TV Show") %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~cnt,
    color = ~cnt,
    colors = "OrRd",
    colorbar = list(title = "Counts"),
    # hover text: country name followed by its count on a new line
    text = ~ paste(country, "<br>Counts: ", cnt)
  )

Accumulated TV Shows & Movies

# Running total of titles (both types combined) per country over the years.
country_accum <- grouped %>%
  group_by(country, year_added) %>%
  summarise(cnt = sum(cnt)) %>% # yearly total per country, types merged
  # arrange() sorts the whole table (groups are ignored by default), so each
  # country's rows end up in ascending year order before the cumulative sum
  arrange(year_added) %>%
  group_by(country) %>%
  mutate(accum = cumsum(cnt))
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# TODO: dynamics graph over year
# Choropleth of the cumulative number of titles per country as of 2015.
# A country that added nothing in 2015 itself still has a running total, so
# take each country's most recent row on or before 2015 instead of only the
# rows where year_added == 2015 (which would drop such countries from the map).
country_accum %>%
  filter(year_added <= 2015) %>% # rows with an NA year are dropped here
  group_by(country) %>%
  slice_max(year_added, n = 1, with_ties = FALSE) %>%
  plot_ly(
    type = "choropleth",
    locations = ~country,
    locationmode = "country names",
    z = ~accum,
    color = ~accum,
    colors = "OrRd",
    colorbar = list(title = "Accumulated Sum"),
    # hover text: country name followed by its running total on a new line
    text = ~ paste(country, "<br>Accumulated Sum: ", accum)
  )

Gathering data

# Per-country share (in percent, one decimal) of each type, together with the
# country's overall title count, limited to the highest totals.
# Output columns: country, total, prop, type.
type_prop <- by_country_type %>%
  group_by(country) %>%
  # grouped mutate() keeps one row per (country, type); the previous multi-row
  # summarise() is deprecated since dplyr 1.1.0
  mutate(
    total = sum(cnt),
    prop = round(cnt / total * 100, 1)
  ) %>%
  ungroup() %>%
  select(country, total, prop, type) %>%
  as.data.frame() %>%
  # same rule as top_n(20, wt = total): keep rows whose total ranks in the top
  # 20 (ties included) without reordering them.
  # NOTE(review): the rank is over rows, and a country can have two rows (one
  # per type), so fewer than 20 countries may survive -- confirm the intent.
  filter(min_rank(desc(total)) <= 20)
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
# Country ordering for the bar chart's axis: countries by descending movie
# share first; TV-show rows sort on the negated share, so a country that only
# appears with TV shows ends up last. Result is a character vector of unique
# country names, used as factor levels.
custom_order <- type_prop %>%
  arrange(desc(ifelse(type == "Movie", prop, -prop))) %>%
  # pull() extracts the column as a plain vector, replacing the former
  # select() %>% array() %>% flatten() chain (purrr::flatten() is deprecated)
  pull(country) %>%
  unique()
# Horizontal stacked bars: one bar per country, split by type, with each
# segment labelled by its percentage.
ggplot(
  type_prop,
  aes(
    x = prop,
    y = factor(country, levels = custom_order), # fixed country ordering
    fill = type
  )
) +
  geom_col() + # bar lengths are taken directly from prop
  geom_text(
    aes(label = scales::percent(prop / 100)),
    position = position_stack(vjust = 0.5), # centre label in its segment
    color = "white",
    size = 3
  ) +
  labs(
    title = "Proportions of Movie and TV Show by Country",
    y = "Country",
    x = "Proportion (%)",
    fill = "Type"
  ) +
  # prop is already on a 0-100 scale, hence scale = 1
  scale_x_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(0, 100)
  ) +
  # colours are assigned to the fill levels in order
  scale_fill_manual(values = c("#221f1f", "#b20710")) +
  theme_minimal()